#!/usr/bin/python
#
# Copyright (c) 2010 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import os
import sys

# Prepend the buildbot pylibs directory to our import path (presumably where
# simplejson comes from -- see the imports below).  insert(0, ...) replaces
# the original reverse/append/reverse dance, which was just a roundabout
# way of prepending.
sys.path.insert(0, os.path.join(os.path.dirname(__file__),
                                '../../buildbot/pylibs'))

import fnmatch
import glob
import itertools
import math
import optparse
import re
import simplejson
import subprocess

__version__ = '1.0'
# Usage string handed to the option parser in Main(); left empty.
USAGE = ""


def GetSummaryFilelist(dir=None):
  """Return a sorted list of *-summary.dat files up to three levels under dir.

  Raises an Exception when no directory is given or it does not exist."""
  if not dir:
    raise Exception("No directory supplied.")
  if not os.path.exists(dir):
    raise Exception("Directory does not exist.")
  # Scan exactly one, two, and three path components below |dir|.
  candidates = itertools.chain(glob.iglob('%s/*' % dir),
                               glob.iglob('%s/*/*' % dir),
                               glob.iglob('%s/*/*/*' % dir))
  return sorted(path for path in candidates
                if fnmatch.fnmatch(path, '*-summary.dat'))


def ReadJson(filename):
  """Read a JSON file and convert its contents into a Python datatype.

  Each non-empty line of |filename| is parsed as a separate JSON value
  (object key order is preserved via OrderedDict).  Returns the list of
  parsed values.  Raises IOError when the file cannot be opened and
  ValueError when a line fails to parse; both are reported on stderr first.
  """
  try:
    datfile = open(filename, 'r')
  except IOError as e:
    sys.stderr.write("I/O Error reading file %s(%s): %s\n" %
                     (filename, e.errno, e.strerror))
    raise

  data = []
  try:
    for jsontext in datfile.read().split("\n"):
      if not jsontext:
        continue
      try:
        json = simplejson.loads(jsontext,
                                object_pairs_hook=simplejson.OrderedDict)
      except ValueError:
        sys.stderr.write("Error parsing file %s: '%s'\n" %
                         (filename, jsontext))
        raise
      data.append(json)
  finally:
    # Close the handle even when a line fails to parse; the original leaked
    # it on that path (and its `if not file` check was dead code, since
    # open() raises rather than returning a falsy value).
    datfile.close()
  return data


def ConvertDataIntoBuckets(perfid, test, graph, data):
  """Convert an array of JSON data into a groomed collection of perf data.

  For each item in the JSON data array, locate the '_ref' trace which
  indicates the reference build's data.  Locate the current build's trace
  corresponding to the reference build.  Store both values (and their
  difference) in new arrays within the groomed collection under a key
  composed of the perf system ID, the test ID, the graph ID, and the trace
  name.

  Assume that the data has stabilized recently and only keep the first 30
  values for each bucket."""
  buckets = {}
  for cl in data:
    for refkey in cl['traces'].keys():
      m = re.match(r'^(.*)_ref$', refkey)
      if not m:
        continue
      tracekey = m.group(1)
      if tracekey not in cl['traces']:
        # Sometimes the current build measure is missing due to a failure.
        continue

      perfkey = "%s/%s/%s/%s" % (perfid, test, graph, tracekey)
      bucket = buckets.setdefault(perfkey,
                                  {'current': [], 'ref': [], 'delta': []})

      current = float(cl['traces'][tracekey][0])
      ref = float(cl['traces'][refkey][0])
      bucket['current'].append(current)
      bucket['ref'].append(ref)
      bucket['delta'].append(current - ref)

  # Only keep the first 30 values for each bucket.  The original performed
  # this truncation outside the loop, so only the last bucket visited was
  # trimmed -- and it raised NameError when no buckets were created at all.
  for bucket in buckets.values():
    bucket['current'] = bucket['current'][:30]
    bucket['ref'] = bucket['ref'][:30]
    bucket['delta'] = bucket['delta'][:30]
  return buckets


def GetDeltas(data, value=None):
  """Returns a list composed by subtracting each item by a given value.

  If the value is not given or is None, the previous item is used and the
  returned list will be one item less than the given list.  (The original
  implementation assigned |value| only once while it was still None, so
  from the second iteration on it kept subtracting data[0] instead of the
  previous item, contradicting this contract.)"""
  if value is None:
    return [data[i] - data[i - 1] for i in range(1, len(data))]
  return [item - value for item in data]


def GetAbsoluteValueList(data):
  """Return a new list holding the absolute value of every item in |data|."""
  return [abs(item) for item in data]


def GetNormalizedList(data, base_value):
  """Return a list with every item of |data| divided by |base_value|.

  Base values smaller than 0.0001 are clamped to 0.0001 to avoid division
  by zero."""
  divisor = float(max(base_value, 0.0001))
  return [item / divisor for item in data]


def GetMedian(data):
  """Returns the median for a list of data, or None for an empty list."""
  if not data:
    return None
  sorted_data = sorted(data)
  # Floor division: the original used `/`, which only yields a valid integer
  # index under Python 2's integer division.
  middle = len(sorted_data) // 2
  if len(sorted_data) % 2 == 0:
    # Even count: average the two middle values.
    return (float(sorted_data[middle]) + float(sorted_data[middle - 1])) / 2
  return sorted_data[middle]


def RemoveNoise(data, max_to_remove=3):
  """Removes noise from a list in place by dropping unusually large outliers.

  This is a simple filter.  The process is:
    1. Create a list of each item's delta from the list's mean and take its
       absolute value.
    2. Normalize the deltas list over the median value.
    3. For each normalized delta, if that value is greater than 3, mark that
       position, value, and delta as a candidate for removal.

  At most |max_to_remove| candidates (3 by default) are removed, preferring
  the largest normalized deltas.  (The original docstring claimed 5 for both
  the threshold and the default; the code has always used 3.)

  Mutates |data|; returns None."""
  mean = sum(data) / len(data)
  deltas = GetAbsoluteValueList(GetDeltas(data, mean))
  normalized_deltas = GetNormalizedList(deltas, GetMedian(deltas))

  candidates = [{'position': i, 'value': data[i], 'delta': nd}
                for i, nd in enumerate(normalized_deltas) if nd > 3]
  # Sort by delta descending, then drop all but max_to_remove elements.
  # (key/reverse replaces the Python 2-only cmp= comparator; both are stable,
  # so tie order is unchanged.)
  candidates.sort(key=lambda c: c['delta'], reverse=True)
  candidates = candidates[:max_to_remove]
  # Pop from the highest position down so earlier indices stay valid.
  candidates.sort(key=lambda c: c['position'], reverse=True)
  for candidate in candidates:
    data.pop(candidate['position'])


def CalculateDeltaAndVar(perfkey, buckets, expectations):
  """Calculate the delta and variance for one bucket.

  Reads buckets[perfkey]['delta'] and stores the resulting delta and
  variance into |expectations| under |perfkey|."""
  data = buckets[perfkey]['delta']

  # The expected delta is the midpoint of the rounded observed extremes.
  delta_max = round(max(data), 0)
  delta_min = round(min(data), 0)
  delta = int((delta_max + delta_min) / 2.0)

  mean = sum(data) / len(data)
  median = GetMedian(data)

  # Noise level: triple the median/mean difference.  Also see RemoveNoise().
  noise_margin = abs((median - mean) * 3)

  # The furthest point allowed above the delta: the distance from the delta
  # to the rounded max, plus the noise margin.
  max_allowed_point = abs(delta_max - delta + noise_margin)

  # The variance is the max allowed point divided by 1.5 (the amount the
  # variance is multiplied by in the Buildbot).
  var = int(round(max_allowed_point / 1.5 + 1, 0))
  if var < 0:
    raise Exception("variance for %s is negative" % perfkey)
  expectations[perfkey] = {'delta': delta, 'var': var}


def WriteJson(filename, data):
  """Write a list of hashes in |data| to the file specified in |filename|.

  The output is a JSON object with one '{"delta": ..., "var": ...}' entry
  per key of |data| (sorted), plus a trailing '"load": true' entry.
  Returns True on success, False when the file cannot be opened (the
  failure is reported on stderr).  The original left the handle unclosed
  and, on open failure, fell through to the Python 2 `file` builtin and
  crashed after printing the error."""
  try:
    out = open(filename, 'w')
  except IOError as e:
    sys.stderr.write("I/O Error writing file %s(%s): %s\n" %
                     (filename, e.errno, e.strerror))
    return False

  contentslist = ['{']
  for key in sorted(data.keys()):
    delta = '{"delta": %s, "var": %s}' % (data[key]['delta'],
                                          data[key]['var'])
    contentslist.append('  "%s": %s,' % (key, delta))
  contentslist.append('  "load": true')
  contentslist.append('}')
  try:
    out.write("\n".join(contentslist) + "\n")
  finally:
    out.close()
  return True


def IsWhitelistedPath(path):
  """Return True if |path| matches one of the whitelisted summary patterns."""
  whitelist = (r'^(.*/)?([^/]+)/([^/]+)/times-summary.dat',
               r'^(.*/)?xp-release-dual-core/moz/([^/]+)-summary.dat',
               r'^(.*/)?([^/]+)/dromaeo/score-summary.dat',
               r'^(.*/)?([^/]+)/startup/warm-summary.dat')
  return any(re.match(pattern, path) for pattern in whitelist)


def IsBlacklistedPath(path):
  """Return True if |path| matches one of the blacklisted summary patterns."""
  blacklist = (r'^(.*/)?linux-release/([^/]+)/([^/]+)-summary.dat',
               r'^(.*/)?linux-release-lowmem/([^/]+)/([^/]+)-summary.dat',
               r'^(.*/)?linux-release-w([^/]+)/([^/]+)/([^/]+)-summary.dat',
               r'^(.*/)?mac-release-10.5-([^/]+)/([^/]+)/([^/]+)-summary.dat',
               r'^(.*/)?mac-release/([^/]+)/([^/]+)-summary.dat',
               r'^(.*/)?vista-release-single-core/([^/]+)/([^/]+)-summary.dat',
               r'^(.*/)?xp-release-single-core/([^/]+)/([^/]+)-summary.dat',
               r'^(.*/)?xp-release-v8-latest/([^/]+)/([^/]+)-summary.dat',
               r'^(.*/)?xp-release-webkit-latest/([^/]+)/([^/]+)-summary.dat',
               r'^(.*/)?xp-release/([^/]+)/([^/]+)-summary.dat')
  return any(re.match(pattern, path) for pattern in blacklist)


def Main(args):
  parser = optparse.OptionParser(usage=USAGE, version=__version__)
  options, args = parser.parse_args(args)

  # Get the given directories the user wants to work in.
  options.dir = []
  if len(args) > 1:
    options.dir.extend(args[1:len(args)])
  # If no directories are given, assume the current working directory.
  if len(options.dir) == 0:
    options.dir.append('.')

  expectations = {}
  for dir in options.dir:
    for filename in GetSummaryFilelist(dir=dir):
      if not IsWhitelistedPath(filename) or IsBlacklistedPath(filename):
        continue

      # Get the perfid, test, and graph from filename.
      print filename
      m = re.match(r'^(.*/)?([^/]+)/([^/]+)/([^/]+)-summary.dat', filename)
      if not m:
        raise Exception('%s did not match expected format' % filename)
      perfid = m.group(2)
      test = m.group(3)
      graph = m.group(4)

      # Only look at the first 200 lines since some dat files contain old trace
      # names we can safely skip.
      jsondata = ReadJson(filename)[:200]
      buckets = ConvertDataIntoBuckets(perfid, test, graph, jsondata)
      for perfkey in buckets:
        RemoveNoise(buckets[perfkey]['delta'])
        CalculateDeltaAndVar(perfkey, buckets, expectations)
  WriteJson('perf_expectations.json', expectations)
  return 0


# Run as a script: exit with Main's status code.  Importing this module as a
# library leaves Main uncalled.
if __name__ == '__main__':
  sys.exit(Main(sys.argv))
